{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The following code selects, from the SPH 12-lead ECG dataset, the parameters required for developing algorithms for biological age estimation from ECG signals. The processing is performed on the SPH dataset `metadata.csv` file. The features involved are:\n",
    "\n",
    "- `metadata['ECG_ID']`\n",
    "- `metadata['AHA_Code'] == '1'` (normal ECG)\n",
    "- `metadata['Age']` (or derived features that express the age as intervals rather than exact values)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Import required packages"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Read the SPH dataset metadata.csv file into a DataFrame"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Location of the SPH metadata file; edit this single constant to run on another machine.\n",
    "SPH_METADATA_CSV = 'E:/1-DENIS/Biomarkers/SPH dataset/metadata.csv'\n",
    "\n",
    "# Read AHA_Code as text so that values such as '22;23' and '1' share one dtype\n",
    "# and the later .str.split(';') never meets a numeric value.\n",
    "ecg_metadata = pd.read_csv(SPH_METADATA_CSV, dtype={'AHA_Code': str})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>ECG_ID</th>\n",
       "      <th>AHA_Code</th>\n",
       "      <th>Patient_ID</th>\n",
       "      <th>Age</th>\n",
       "      <th>Sex</th>\n",
       "      <th>N</th>\n",
       "      <th>Date</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>A00001</td>\n",
       "      <td>22;23</td>\n",
       "      <td>S00001</td>\n",
       "      <td>55</td>\n",
       "      <td>M</td>\n",
       "      <td>5000</td>\n",
       "      <td>2020-03-04</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>A00002</td>\n",
       "      <td>1</td>\n",
       "      <td>S00002</td>\n",
       "      <td>32</td>\n",
       "      <td>M</td>\n",
       "      <td>6000</td>\n",
       "      <td>2019-09-03</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>A00003</td>\n",
       "      <td>1</td>\n",
       "      <td>S00003</td>\n",
       "      <td>63</td>\n",
       "      <td>M</td>\n",
       "      <td>6500</td>\n",
       "      <td>2020-07-16</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>A00004</td>\n",
       "      <td>23</td>\n",
       "      <td>S00004</td>\n",
       "      <td>31</td>\n",
       "      <td>M</td>\n",
       "      <td>5000</td>\n",
       "      <td>2020-07-14</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>A00005</td>\n",
       "      <td>146</td>\n",
       "      <td>S00005</td>\n",
       "      <td>47</td>\n",
       "      <td>M</td>\n",
       "      <td>5500</td>\n",
       "      <td>2020-01-07</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   ECG_ID AHA_Code Patient_ID  Age Sex     N        Date\n",
       "0  A00001    22;23     S00001   55   M  5000  2020-03-04\n",
       "1  A00002        1     S00002   32   M  6000  2019-09-03\n",
       "2  A00003        1     S00003   63   M  6500  2020-07-16\n",
       "3  A00004       23     S00004   31   M  5000  2020-07-14\n",
       "4  A00005      146     S00005   47   M  5500  2020-01-07"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ecg_metadata.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Method: remove non-primary AHA codes (for more information, refer to the article describing the SPH dataset)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "def remove_nonprimary_code(x):\n",
    "    \"\"\"Keep only the primary AHA statement codes of one ECG record.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    x : iterable of str\n",
    "        Statements of one record, e.g. the result of splitting AHA_Code on ';'.\n",
    "        Each statement is a '+'-joined group of numeric codes.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    list of str\n",
    "        Codes whose value is below 200 or at least 500, stripped of surrounding\n",
    "        whitespace, without duplicates and in order of first appearance.\n",
    "\n",
    "    Raises\n",
    "    ------\n",
    "    ValueError\n",
    "        If a non-blank token is not an integer.\n",
    "    \"\"\"\n",
    "    primary = []\n",
    "    for statement in x:\n",
    "        for token in statement.split('+'):\n",
    "            code = token.strip()\n",
    "            if not code:\n",
    "                # Blank token (e.g. from a trailing separator): nothing to classify.\n",
    "                continue\n",
    "            value = int(code)\n",
    "            # Values in [200, 500) are discarded as non-primary.\n",
    "            if (value < 200 or value >= 500) and code not in primary:\n",
    "                primary.append(code)\n",
    "    return primary"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "codes = ecg_metadata['AHA_Code'].str.split(';')\n",
    "primary_codes = codes.apply(remove_nonprimary_code)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Include in the DataFrame the new column representing the primary codes (primary_codes)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "ecg_metadata['primary_codes'] = primary_codes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>ECG_ID</th>\n",
       "      <th>AHA_Code</th>\n",
       "      <th>Patient_ID</th>\n",
       "      <th>Age</th>\n",
       "      <th>Sex</th>\n",
       "      <th>N</th>\n",
       "      <th>Date</th>\n",
       "      <th>primary_codes</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>A00001</td>\n",
       "      <td>22;23</td>\n",
       "      <td>S00001</td>\n",
       "      <td>55</td>\n",
       "      <td>M</td>\n",
       "      <td>5000</td>\n",
       "      <td>2020-03-04</td>\n",
       "      <td>[22, 23]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>A00002</td>\n",
       "      <td>1</td>\n",
       "      <td>S00002</td>\n",
       "      <td>32</td>\n",
       "      <td>M</td>\n",
       "      <td>6000</td>\n",
       "      <td>2019-09-03</td>\n",
       "      <td>[1]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>A00003</td>\n",
       "      <td>1</td>\n",
       "      <td>S00003</td>\n",
       "      <td>63</td>\n",
       "      <td>M</td>\n",
       "      <td>6500</td>\n",
       "      <td>2020-07-16</td>\n",
       "      <td>[1]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>A00004</td>\n",
       "      <td>23</td>\n",
       "      <td>S00004</td>\n",
       "      <td>31</td>\n",
       "      <td>M</td>\n",
       "      <td>5000</td>\n",
       "      <td>2020-07-14</td>\n",
       "      <td>[23]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>A00005</td>\n",
       "      <td>146</td>\n",
       "      <td>S00005</td>\n",
       "      <td>47</td>\n",
       "      <td>M</td>\n",
       "      <td>5500</td>\n",
       "      <td>2020-01-07</td>\n",
       "      <td>[146]</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   ECG_ID AHA_Code Patient_ID  Age Sex     N        Date primary_codes\n",
       "0  A00001    22;23     S00001   55   M  5000  2020-03-04      [22, 23]\n",
       "1  A00002        1     S00002   32   M  6000  2019-09-03           [1]\n",
       "2  A00003        1     S00003   63   M  6500  2020-07-16           [1]\n",
       "3  A00004       23     S00004   31   M  5000  2020-07-14          [23]\n",
       "4  A00005      146     S00005   47   M  5500  2020-01-07         [146]"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ecg_metadata.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Method: keep normal AHA_Code (AHA_Code == '1') and replace non-normal AHA_Code values with NaN"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_code_1(x):\n",
    "    \"\"\"Return the codes of a record only when it is a normal ECG (code '1').\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    x : sequence of str\n",
    "        Primary codes of one record, as produced by remove_nonprimary_code.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    list of str or float\n",
    "        A new list with the codes when the record is non-empty and every code\n",
    "        equals '1'; otherwise np.nan, so the row can be removed with dropna.\n",
    "    \"\"\"\n",
    "    codes = list(x)\n",
    "    # Decide on the whole record at once: the result must not depend on the\n",
    "    # order of the codes, and a record without codes is not a normal ECG.\n",
    "    if codes and all(code == '1' for code in codes):\n",
    "        return codes\n",
    "    # np.nan is used instead of np.NaN, an alias removed in NumPy 2.0.\n",
    "    return np.nan"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "normal_codes = ecg_metadata.primary_codes.apply(get_code_1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0        NaN\n",
       "1        [1]\n",
       "2        [1]\n",
       "3        NaN\n",
       "4        NaN\n",
       "        ... \n",
       "25765    NaN\n",
       "25766    NaN\n",
       "25767    NaN\n",
       "25768    NaN\n",
       "25769    NaN\n",
       "Name: primary_codes, Length: 25770, dtype: object"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "normal_codes"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Include in the DataFrame the new column representing the normal codes (AHA_Code == '1'), with NaN values for the rest"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "ecg_metadata['normal_codes'] = normal_codes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>ECG_ID</th>\n",
       "      <th>AHA_Code</th>\n",
       "      <th>Patient_ID</th>\n",
       "      <th>Age</th>\n",
       "      <th>Sex</th>\n",
       "      <th>N</th>\n",
       "      <th>Date</th>\n",
       "      <th>primary_codes</th>\n",
       "      <th>normal_codes</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>A00001</td>\n",
       "      <td>22;23</td>\n",
       "      <td>S00001</td>\n",
       "      <td>55</td>\n",
       "      <td>M</td>\n",
       "      <td>5000</td>\n",
       "      <td>2020-03-04</td>\n",
       "      <td>[22, 23]</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>A00002</td>\n",
       "      <td>1</td>\n",
       "      <td>S00002</td>\n",
       "      <td>32</td>\n",
       "      <td>M</td>\n",
       "      <td>6000</td>\n",
       "      <td>2019-09-03</td>\n",
       "      <td>[1]</td>\n",
       "      <td>[1]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>A00003</td>\n",
       "      <td>1</td>\n",
       "      <td>S00003</td>\n",
       "      <td>63</td>\n",
       "      <td>M</td>\n",
       "      <td>6500</td>\n",
       "      <td>2020-07-16</td>\n",
       "      <td>[1]</td>\n",
       "      <td>[1]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>A00004</td>\n",
       "      <td>23</td>\n",
       "      <td>S00004</td>\n",
       "      <td>31</td>\n",
       "      <td>M</td>\n",
       "      <td>5000</td>\n",
       "      <td>2020-07-14</td>\n",
       "      <td>[23]</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>A00005</td>\n",
       "      <td>146</td>\n",
       "      <td>S00005</td>\n",
       "      <td>47</td>\n",
       "      <td>M</td>\n",
       "      <td>5500</td>\n",
       "      <td>2020-01-07</td>\n",
       "      <td>[146]</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   ECG_ID AHA_Code Patient_ID  Age Sex     N        Date primary_codes  \\\n",
       "0  A00001    22;23     S00001   55   M  5000  2020-03-04      [22, 23]   \n",
       "1  A00002        1     S00002   32   M  6000  2019-09-03           [1]   \n",
       "2  A00003        1     S00003   63   M  6500  2020-07-16           [1]   \n",
       "3  A00004       23     S00004   31   M  5000  2020-07-14          [23]   \n",
       "4  A00005      146     S00005   47   M  5500  2020-01-07         [146]   \n",
       "\n",
       "  normal_codes  \n",
       "0          NaN  \n",
       "1          [1]  \n",
       "2          [1]  \n",
       "3          NaN  \n",
       "4          NaN  "
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ecg_metadata.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep normal ECGs only. The NaN check is restricted to the columns this analysis\n",
    "# needs, so a missing value in an unrelated column does not silently discard a record.\n",
    "ecg_normal = ecg_metadata.dropna(axis=0, subset=['normal_codes', 'Age'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Verify that the number of records with AHA_Code == '1' matches the count reported for the SPH 12-lead ECG dataset (Category == 'A', Code == '1', Primary Statement == 'Normal ECG', Count == 13905)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(13905, 9)"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ecg_normal.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Renumber the rows 0..n-1; rebinding the name avoids inplace mutation of a derived frame.\n",
    "ecg_normal = ecg_normal.reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>ECG_ID</th>\n",
       "      <th>AHA_Code</th>\n",
       "      <th>Patient_ID</th>\n",
       "      <th>Age</th>\n",
       "      <th>Sex</th>\n",
       "      <th>N</th>\n",
       "      <th>Date</th>\n",
       "      <th>primary_codes</th>\n",
       "      <th>normal_codes</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>A00002</td>\n",
       "      <td>1</td>\n",
       "      <td>S00002</td>\n",
       "      <td>32</td>\n",
       "      <td>M</td>\n",
       "      <td>6000</td>\n",
       "      <td>2019-09-03</td>\n",
       "      <td>[1]</td>\n",
       "      <td>[1]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>A00003</td>\n",
       "      <td>1</td>\n",
       "      <td>S00003</td>\n",
       "      <td>63</td>\n",
       "      <td>M</td>\n",
       "      <td>6500</td>\n",
       "      <td>2020-07-16</td>\n",
       "      <td>[1]</td>\n",
       "      <td>[1]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>A00006</td>\n",
       "      <td>1</td>\n",
       "      <td>S00006</td>\n",
       "      <td>46</td>\n",
       "      <td>F</td>\n",
       "      <td>5000</td>\n",
       "      <td>2019-08-31</td>\n",
       "      <td>[1]</td>\n",
       "      <td>[1]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>A00008</td>\n",
       "      <td>1</td>\n",
       "      <td>S00008</td>\n",
       "      <td>32</td>\n",
       "      <td>M</td>\n",
       "      <td>5000</td>\n",
       "      <td>2019-10-02</td>\n",
       "      <td>[1]</td>\n",
       "      <td>[1]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>A00009</td>\n",
       "      <td>1</td>\n",
       "      <td>S00009</td>\n",
       "      <td>48</td>\n",
       "      <td>F</td>\n",
       "      <td>6000</td>\n",
       "      <td>2019-08-20</td>\n",
       "      <td>[1]</td>\n",
       "      <td>[1]</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   ECG_ID AHA_Code Patient_ID  Age Sex     N        Date primary_codes  \\\n",
       "0  A00002        1     S00002   32   M  6000  2019-09-03           [1]   \n",
       "1  A00003        1     S00003   63   M  6500  2020-07-16           [1]   \n",
       "2  A00006        1     S00006   46   F  5000  2019-08-31           [1]   \n",
       "3  A00008        1     S00008   32   M  5000  2019-10-02           [1]   \n",
       "4  A00009        1     S00009   48   F  6000  2019-08-20           [1]   \n",
       "\n",
       "  normal_codes  \n",
       "0          [1]  \n",
       "1          [1]  \n",
       "2          [1]  \n",
       "3          [1]  \n",
       "4          [1]  "
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ecg_normal.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 13905 entries, 0 to 13904\n",
      "Data columns (total 9 columns):\n",
      " #   Column         Non-Null Count  Dtype \n",
      "---  ------         --------------  ----- \n",
      " 0   ECG_ID         13905 non-null  object\n",
      " 1   AHA_Code       13905 non-null  object\n",
      " 2   Patient_ID     13905 non-null  object\n",
      " 3   Age            13905 non-null  int64 \n",
      " 4   Sex            13905 non-null  object\n",
      " 5   N              13905 non-null  int64 \n",
      " 6   Date           13905 non-null  object\n",
      " 7   primary_codes  13905 non-null  object\n",
      " 8   normal_codes   13905 non-null  object\n",
      "dtypes: int64(2), object(7)\n",
      "memory usage: 977.8+ KB\n"
     ]
    }
   ],
   "source": [
    "ecg_normal.info()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Method: build a new column of age classes from intervals defined on the Age feature, following the age distribution described for the SPH ECG dataset. Classes are labelled 0 to 8 ([10, 20), [20, 30), ..., [90, 100))."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "def define_age_intervals_0(i):\n",
    "    \"\"\"Map an age to one of nine age classes of width 10.\n",
    "\n",
    "    Classes 0 to 8 correspond to [10, 20), [20, 30), ..., [90, 100).\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    i : int or float\n",
    "        Age to classify.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    int or float\n",
    "        Class label, or np.nan when the age is outside [10, 100).\n",
    "    \"\"\"\n",
    "    # Exclusive upper bound of each class; the position in the tuple is the label.\n",
    "    upper_bounds = (20, 30, 40, 50, 60, 70, 80, 90, 100)\n",
    "    if i >= 10:\n",
    "        for label, upper in enumerate(upper_bounds):\n",
    "            if i < upper:\n",
    "                return label\n",
    "    # np.nan is used instead of np.NaN, an alias removed in NumPy 2.0.\n",
    "    return np.nan"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "age_intervals_0 = ecg_normal['Age'].apply(define_age_intervals_0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "age_intervals_0.name = 'Age_class_0'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0        2\n",
       "1        5\n",
       "2        3\n",
       "3        2\n",
       "4        3\n",
       "        ..\n",
       "13900    3\n",
       "13901    6\n",
       "13902    4\n",
       "13903    1\n",
       "13904    4\n",
       "Name: Age_class_0, Length: 13905, dtype: int64"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "age_intervals_0"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Widen the age intervals to obtain coarser age classes: labels 0 to 4 for [10, 30), [30, 50), [50, 70), [70, 90), [90, 100)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": [
    "def define_age_intervals_1(i):\n",
    "    \"\"\"Map an age to one of five coarse age classes.\n",
    "\n",
    "    Classes 0 to 4 correspond to [10, 30), [30, 50), [50, 70), [70, 90), [90, 100).\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    i : int or float\n",
    "        Age to classify.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    int or float\n",
    "        Class label, or np.nan when the age is outside [10, 100).\n",
    "    \"\"\"\n",
    "    # Exclusive upper bound of each class; the position in the tuple is the label.\n",
    "    upper_bounds = (30, 50, 70, 90, 100)\n",
    "    if i >= 10:\n",
    "        for label, upper in enumerate(upper_bounds):\n",
    "            if i < upper:\n",
    "                return label\n",
    "    # np.nan is used instead of np.NaN, an alias removed in NumPy 2.0.\n",
    "    return np.nan"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [],
   "source": [
    "age_intervals_1 = ecg_normal['Age'].apply(define_age_intervals_1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [],
   "source": [
    "age_intervals_1.name = 'Age_class_1'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0        1\n",
       "1        2\n",
       "2        1\n",
       "3        1\n",
       "4        1\n",
       "        ..\n",
       "13900    1\n",
       "13901    3\n",
       "13902    2\n",
       "13903    0\n",
       "13904    2\n",
       "Name: Age_class_1, Length: 13905, dtype: int64"
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "age_intervals_1"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Build the DataFrame with the representative data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Widen the age intervals further to obtain coarser age classes: labels 0 to 2 for [10, 40), [40, 70), [70, 100)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [],
   "source": [
    "def define_age_intervals_2(i):\n",
    "    \"\"\"Map an age to one of three coarse age classes.\n",
    "\n",
    "    Classes 0 to 2 correspond to [10, 40), [40, 70), [70, 100).\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    i : int or float\n",
    "        Age to classify.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    int or float\n",
    "        Class label, or np.nan when the age is outside [10, 100).\n",
    "    \"\"\"\n",
    "    # Exclusive upper bound of each class; the position in the tuple is the label.\n",
    "    upper_bounds = (40, 70, 100)\n",
    "    if i >= 10:\n",
    "        for label, upper in enumerate(upper_bounds):\n",
    "            if i < upper:\n",
    "                return label\n",
    "    # np.nan is used instead of np.NaN, an alias removed in NumPy 2.0.\n",
    "    return np.nan"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [],
   "source": [
    "age_intervals_2 = ecg_normal['Age'].apply(define_age_intervals_2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [],
   "source": [
    "age_intervals_2.name = 'Age_class_2'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0        0\n",
       "1        1\n",
       "2        1\n",
       "3        0\n",
       "4        1\n",
       "        ..\n",
       "13900    1\n",
       "13901    2\n",
       "13902    1\n",
       "13903    0\n",
       "13904    1\n",
       "Name: Age_class_2, Length: 13905, dtype: int64"
      ]
     },
     "execution_count": 45,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "age_intervals_2"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Widen the age intervals once more to obtain a binary age class: labels 0 and 1 for [10, 50) and [50, 100)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [],
   "source": [
    "def define_age_intervals_3(i):\n",
    "    \"\"\"Map an age to one of two age classes.\n",
    "\n",
    "    Class 0 corresponds to [10, 50) and class 1 to [50, 100).\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    i : int or float\n",
    "        Age to classify.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    int or float\n",
    "        Class label, or np.nan when the age is outside [10, 100).\n",
    "    \"\"\"\n",
    "    # Exclusive upper bound of each class; the position in the tuple is the label.\n",
    "    upper_bounds = (50, 100)\n",
    "    if i >= 10:\n",
    "        for label, upper in enumerate(upper_bounds):\n",
    "            if i < upper:\n",
    "                return label\n",
    "    # np.nan is used instead of np.NaN, an alias removed in NumPy 2.0.\n",
    "    return np.nan"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [],
   "source": [
    "age_intervals_3 = ecg_normal['Age'].apply(define_age_intervals_3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [],
   "source": [
    "age_intervals_3.name = 'Age_class_3'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0        0\n",
       "1        1\n",
       "2        0\n",
       "3        0\n",
       "4        0\n",
       "        ..\n",
       "13900    0\n",
       "13901    1\n",
       "13902    1\n",
       "13903    0\n",
       "13904    1\n",
       "Name: Age_class_3, Length: 13905, dtype: int64"
      ]
     },
     "execution_count": 47,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "age_intervals_3"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Build a DataFrame with the columns ECG_ID, Age, Age_class_0, Age_class_1, Age_class_2, Age_class_3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Place the identifier, the raw age and the four age-class series side by side.\n",
    "normal_ecg_age = pd.concat(\n",
    "    objs=[\n",
    "        ecg_normal['ECG_ID'],\n",
    "        ecg_normal['Age'],\n",
    "        age_intervals_0,\n",
    "        age_intervals_1,\n",
    "        age_intervals_2,\n",
    "        age_intervals_3,\n",
    "    ],\n",
    "    axis=1,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>ECG_ID</th>\n",
       "      <th>Age</th>\n",
       "      <th>Age_class_0</th>\n",
       "      <th>Age_class_1</th>\n",
       "      <th>Age_class_2</th>\n",
       "      <th>Age_class_3</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>A00002</td>\n",
       "      <td>32</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>A00003</td>\n",
       "      <td>63</td>\n",
       "      <td>5</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>A00006</td>\n",
       "      <td>46</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>A00008</td>\n",
       "      <td>32</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>A00009</td>\n",
       "      <td>48</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   ECG_ID  Age  Age_class_0  Age_class_1  Age_class_2  Age_class_3\n",
       "0  A00002   32            2            1            0            0\n",
       "1  A00003   63            5            2            1            1\n",
       "2  A00006   46            3            1            1            0\n",
       "3  A00008   32            2            1            0            0\n",
       "4  A00009   48            3            1            1            0"
      ]
     },
     "execution_count": 49,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "normal_ecg_age.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Save normal_ecg_age dataframe in pickle format"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save normal_ecg_age in pickle format\n",
    "normal_ecg_age.to_pickle('normal_ecg_age.pickle')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3.8.5 ('ecg_biomarkers': venv)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  },
  "vscode": {
   "interpreter": {
    "hash": "d39f5264c2f26cb0964d5a98dbbf3a9d2bc7301339817434e4f0ffa82f5f047f"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
